import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import math
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.impute import KNNImputer
from fancyimpute import KNN
from functools import reduce
import plotly.express as px
import plotly.graph_objects as go
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.metrics import (confusion_matrix,
accuracy_score)
import warnings # For handling error messages.
# Don't worry about the following two instructions: they just suppress warnings that could occur later.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
# Load the dataset produced in the Data Wrangling section of the Capstone project
# and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/Users/amrita/Desktop/dummies_data.csv')
df.head(n=5)
# We start by creating visualization of the target variable
# Our plots suggest that our classes are balanced
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: share of each satisfaction value as a pie chart.
# `explode` has two entries, so this expects exactly two distinct values.
df['satisfaction'].value_counts().plot.pie(
    explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True
)
ax[0].set_title('satisfaction')
ax[0].set_ylabel('')

# Right panel: row count per satisfaction value.
# `x` must be passed by keyword: seaborn 0.11 deprecated passing it
# positionally and seaborn 0.12+ rejects that call form.
sns.countplot(x='satisfaction', data=df, ax=ax[1])
ax[1].set_title('satisfaction')
plt.show()
54.7% of people are satisfied with the airline and 45.3% are not satisfied with the airline.
# Bare expression: in a notebook this renders the DataFrame as the cell's output.
df
def generate_group(column_name, kind, data=None):
    """Return the mean ``Age`` for each distinct value of ``column_name``.

    Parameters
    ----------
    column_name : str
        Column whose distinct values define the groups.
    kind : str
        Pass ``'bar'`` to also render the group means as a Plotly bar chart;
        any other value skips plotting.
    data : pandas.DataFrame, optional
        Frame to summarise. It must contain ``column_name`` and ``'Age'``.
        When omitted, the module-level ``df`` is used, which keeps existing
        two-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by ``column_name``, with a single ``Age``
        column holding the group mean.
    """
    # Fall back to the notebook-wide frame only when no frame is supplied, so
    # the function can be reused (and tested) on any DataFrame.
    frame = df if data is None else data
    group = frame[[column_name, 'Age']].groupby([column_name]).mean()
    if kind == 'bar':
        fig = px.bar(group, title=f"<b>Average age by {column_name}</b>")
        fig.show()
    return group
# Plot, and keep, the mean Age for each value of the satisfaction column.
age = generate_group('satisfaction',kind = 'bar')
Satisfied customers have an average age of approximately 41 years, while dissatisfied customers have an average age of approximately 38 years.
def generate_group1(column_name, kind, data=None):
    """Return the mean ``Flight_Distance`` for each value of ``column_name``.

    Parameters
    ----------
    column_name : str
        Column whose distinct values define the groups.
    kind : str
        Pass ``'bar'`` to also render the group means as a Plotly bar chart;
        any other value skips plotting.
    data : pandas.DataFrame, optional
        Frame to summarise. It must contain ``column_name`` and
        ``'Flight_Distance'``. When omitted, the module-level ``df`` is used,
        which keeps existing two-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by ``column_name``, with a single
        ``Flight_Distance`` column holding the group mean.
    """
    # Fall back to the notebook-wide frame only when no frame is supplied, so
    # the function can be reused (and tested) on any DataFrame.
    frame = df if data is None else data
    group1 = frame[[column_name, 'Flight_Distance']].groupby([column_name]).mean()
    if kind == 'bar':
        fig = px.bar(group1, title=f"<b>Average Flight_Distance by {column_name}</b>")
        fig.show()
    return group1
# Plot, and keep, the mean Flight_Distance for each value of the satisfaction column.
Flight_Distance = generate_group1('satisfaction',kind = 'bar')
On average, satisfied customers flew a distance of 1944.47 miles with the airline, while dissatisfied customers flew around 2025.203 miles.
def generate_group3(column_name, kind, data=None):
    """Return the mean ``Departure_Delay_in_Minutes`` per value of ``column_name``.

    Parameters
    ----------
    column_name : str
        Column whose distinct values define the groups.
    kind : str
        Pass ``'bar'`` to also render the group means as a Plotly bar chart;
        any other value skips plotting.
    data : pandas.DataFrame, optional
        Frame to summarise. It must contain ``column_name`` and
        ``'Departure_Delay_in_Minutes'``. When omitted, the module-level
        ``df`` is used, which keeps existing two-argument calls working
        unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by ``column_name``, with a single
        ``Departure_Delay_in_Minutes`` column holding the group mean.
    """
    # Fall back to the notebook-wide frame only when no frame is supplied, so
    # the function can be reused (and tested) on any DataFrame.
    frame = df if data is None else data
    group3 = (
        frame[[column_name, 'Departure_Delay_in_Minutes']]
        .groupby([column_name])
        .mean()
    )
    if kind == 'bar':
        fig = px.bar(group3, title=f"<b>Average Departure Delay in Minutes by {column_name}</b>")
        fig.show()
    return group3
# Plot, and keep, the mean Departure_Delay_in_Minutes for each value of the satisfaction column.
Departure_Delay = generate_group3('satisfaction',kind = 'bar')
Dissatisfied customers had an average departure delay of approximately 18 minutes. (As noted earlier, dissatisfied customers also flew around 2025.203 miles on average with the airline.)
def generate_group4(column_name, kind, data=None):
    """Return the mean ``Arrival_Delay_in_Minutes`` per value of ``column_name``.

    Parameters
    ----------
    column_name : str
        Column whose distinct values define the groups.
    kind : str
        Pass ``'bar'`` to also render the group means as a Plotly bar chart;
        any other value skips plotting.
    data : pandas.DataFrame, optional
        Frame to summarise. It must contain ``column_name`` and
        ``'Arrival_Delay_in_Minutes'``. When omitted, the module-level ``df``
        is used, which keeps existing two-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by ``column_name``, with a single
        ``Arrival_Delay_in_Minutes`` column holding the group mean.
    """
    # Fall back to the notebook-wide frame only when no frame is supplied, so
    # the function can be reused (and tested) on any DataFrame.
    frame = df if data is None else data
    group4 = (
        frame[[column_name, 'Arrival_Delay_in_Minutes']]
        .groupby([column_name])
        .mean()
    )
    if kind == 'bar':
        fig = px.bar(group4, title=f"<b>Average Arrival Delay by {column_name}</b>")
        fig.show()
    return group4
# Plot, and keep, the mean Arrival_Delay_in_Minutes for each value of the satisfaction column.
Arrival_Delay = generate_group4('satisfaction',kind = 'bar')
# Gender and Flight Satisfaction
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: mean of the satisfaction column for each value of 'Male'.
df[['Male', 'satisfaction']].groupby(['Male']).mean().plot.bar(ax=ax[0])

# Right panel: row counts per 'Male' value, split by satisfaction.
# `x` must be passed by keyword: seaborn 0.11 deprecated passing it
# positionally and seaborn 0.12+ rejects that call form.
sns.countplot(x='Male', hue='satisfaction', data=df, ax=ax[1])

ax[1].set_title('gender: Satisfied vs Not Satisfied')
ax[0].set_title('satisfaction vs gender')
plt.show()
1st plot: Females are more satisfied than males. 2nd plot: Females are more satisfied with the airline, while males are more dissatisfied with the airline.
# Disloyal Customer and Flight Satisfaction
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: mean of the satisfaction column for each value of 'disloyal_Customer'.
df[['disloyal_Customer', 'satisfaction']].groupby(['disloyal_Customer']).mean().plot.bar(ax=ax[0])

# Right panel: row counts per 'disloyal_Customer' value, split by satisfaction.
# `x` must be passed by keyword: seaborn 0.11 deprecated passing it
# positionally and seaborn 0.12+ rejects that call form.
sns.countplot(x='disloyal_Customer', hue='satisfaction', data=df, ax=ax[1])

ax[1].set_title('disloyal_Customer: Satisfied vs Not Satisfied')
ax[0].set_title('satisfaction vs disloyal_Customer')
plt.show()
1st plot: Disloyal customers of the airline are less satisfied than loyal customers. 2nd plot: Loyal customers of the airline are more often satisfied than not satisfied; disloyal customers are more often dissatisfied than satisfied.
# Travel type vs Flight Satisfaction
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: mean of the satisfaction column for each value of 'Personal_Travel'.
df[['Personal_Travel', 'satisfaction']].groupby(['Personal_Travel']).mean().plot.bar(ax=ax[0])

# Right panel: row counts per 'Personal_Travel' value, split by satisfaction.
# `x` must be passed by keyword: seaborn 0.11 deprecated passing it
# positionally and seaborn 0.12+ rejects that call form.
sns.countplot(x='Personal_Travel', hue='satisfaction', data=df, ax=ax[1])

ax[1].set_title('Personal_Travel: Satisfied vs Not Satisfied')
ax[0].set_title('satisfaction vs Personal_Travel')
plt.show()
1st plot: Customers travelling for business are more satisfied than customers travelling for personal reasons. 2nd plot: Business travellers are more often satisfied than not satisfied; personal travellers are more often dissatisfied than satisfied.
# Count rows for every (Cleanliness, satisfaction) combination, including the
# margin totals, and shade the table with the reversed 'summer' colormap.
(
    pd.crosstab(index=df['Cleanliness'], columns=df['satisfaction'], margins=True)
    .style
    .background_gradient(cmap='summer_r')
)
A cleanliness level of 4 or 5 yields higher satisfaction among customers.
# List the column labels available in the DataFrame.
df.columns
# Count rows for every (Seat_comfort, satisfaction) combination, including the
# margin totals, and shade the table with the reversed 'summer' colormap.
(
    pd.crosstab(index=df['Seat_comfort'], columns=df['satisfaction'], margins=True)
    .style
    .background_gradient(cmap='summer_r')
)
A seat comfort level of 4 or 5 yields higher satisfaction among customers.
# Count rows for every (Inflight_entertainment, satisfaction) combination,
# including the margin totals, and shade the table with the reversed 'summer'
# colormap.
(
    pd.crosstab(index=df['Inflight_entertainment'], columns=df['satisfaction'], margins=True)
    .style
    .background_gradient(cmap='summer_r')
)
An Inflight_entertainment level of 4 or 5 yields higher satisfaction among customers, suggesting that better inflight entertainment provides higher satisfaction.
# Count rows for every (Ease_of_Online_booking, satisfaction) combination,
# including the margin totals, and shade the table with the reversed 'summer'
# colormap.
(
    pd.crosstab(index=df['Ease_of_Online_booking'], columns=df['satisfaction'], margins=True)
    .style
    .background_gradient(cmap='summer_r')
)
A better Ease_of_Online_booking service yields more satisfaction among customers, as suggested by the ratings of 4 and 5 for the Ease_of_Online_booking service.
# Count rows for every (Inflight_wifi_service, satisfaction) combination,
# including the margin totals, and shade the table with the reversed 'summer'
# colormap.
(
    pd.crosstab(index=df['Inflight_wifi_service'], columns=df['satisfaction'], margins=True)
    .style
    .background_gradient(cmap='summer_r')
)
Inflight wifi service doesn't hold much importance for satisfaction: even when inflight wifi service got a rating of 2, the number of satisfied customers is greater than the number of dissatisfied customers.
# Count rows for every (On_board_service, satisfaction) combination, including
# the margin totals, and shade the table with the reversed 'summer' colormap.
(
    pd.crosstab(index=df['On_board_service'], columns=df['satisfaction'], margins=True)
    .style
    .background_gradient(cmap='summer_r')
)
Better onboard service will yield better satisfaction among customers as suggested by the ratings of 4 and 5 for onboard service.
# Count rows for every (Leg_room_service, satisfaction) combination, including
# the margin totals, and shade the table with the reversed 'summer' colormap.
(
    pd.crosstab(index=df['Leg_room_service'], columns=df['satisfaction'], margins=True)
    .style
    .background_gradient(cmap='summer_r')
)
Better leg room service will yield better satisfaction among customers, as suggested by the ratings of 4 and 5 for leg room service.
# Count rows for every (Food_and_drink, satisfaction) combination, including
# the margin totals, and shade the table with the reversed 'summer' colormap.
(
    pd.crosstab(index=df['Food_and_drink'], columns=df['satisfaction'], margins=True)
    .style
    .background_gradient(cmap='summer_r')
)
Better food and drink will yield better satisfaction among customers, as suggested by the ratings of 4 and 5 for food and drink.